# Pull the full credit_score table from the database into a data frame.
res <- dbSendQuery(con, "SELECT * FROM credit_score")
credit_score <- dbFetch(res)
dbClearResult(res)  # release the result set so the connection stays clean

# Normalize column names to lower case (vectorized; no loop needed).
colnames(credit_score) <- tolower(colnames(credit_score))

# Drop bookkeeping columns that carry no predictive information.
credit_score <- subset(credit_score, select = -c(index, id))

# Encode the categorical variables as factors.
categories <- c("sex", "education", "marriage", "default")
for (col in categories) {
  credit_score[, col] <- as.factor(credit_score[, col])
}

# Every remaining column is numeric; coerce explicitly (unlist guards
# against list columns returned by some DB drivers).
numerical <- setdiff(names(credit_score), categories)
for (n in numerical) {
  credit_score[, n] <- as.numeric(unlist(credit_score[, n]))
}

head(credit_score, 10)
## limit_bal sex education marriage age pay_0 pay_2 pay_3 pay_4 pay_5 pay_6
## 1 20000 2 2 1 24 2 2 0 0 0 0
## 2 120000 2 2 2 26 0 2 0 0 0 2
## 3 90000 2 2 2 34 0 0 0 0 0 0
## 4 50000 2 2 1 37 0 0 0 0 0 0
## 5 50000 1 2 1 57 0 0 0 0 0 0
## 6 50000 1 1 2 37 0 0 0 0 0 0
## 7 500000 1 1 2 29 0 0 0 0 0 0
## 8 100000 2 2 2 23 0 0 0 0 0 0
## 9 140000 2 3 1 28 0 0 2 0 0 0
## 10 20000 1 3 2 35 0 0 0 0 0 0
## bill_amt1 bill_amt2 bill_amt3 bill_amt4 bill_amt5 bill_amt6 pay_amt1
## 1 3913 3102 689 0 0 0 0
## 2 2682 1725 2682 3272 3455 3261 0
## 3 29239 14027 13559 14331 14948 15549 1518
## 4 46990 48233 49291 28314 28959 29547 2000
## 5 8617 5670 35835 20940 19146 19131 2000
## 6 64400 57069 57608 19394 19619 20024 2500
## 7 367965 412023 445007 542653 483003 473944 55000
## 8 11876 380 601 221 -159 567 380
## 9 11285 14096 12108 12211 11793 3719 3329
## 10 0 0 0 0 13007 13912 0
## pay_amt2 pay_amt3 pay_amt4 pay_amt5 pay_amt6 default
## 1 689 0 0 0 0 1
## 2 1000 1000 1000 0 2000 1
## 3 1500 1000 1000 1000 5000 0
## 4 2019 1200 1100 1069 1000 0
## 5 36681 10000 9000 689 679 0
## 6 1815 657 1000 1000 800 0
## 7 40000 38000 20239 13750 13770 0
## 8 601 0 581 1687 1542 0
## 9 0 432 1000 1000 1000 0
## 10 0 0 13007 1122 0 0
First, I have to admit that this is a synthetic dataset, which means that there are no missing values, outliers, errors, or other data-quality issues, so those checks will be skipped.
# The credit limit is heavily right-skewed, so work on the log scale.
credit_score$limit_bal <- log(credit_score$limit_bal)
p1 <- ggplot(credit_score, aes(limit_bal)) +
  geom_histogram()
ggplotly(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Log-transform age as well and inspect its distribution.
credit_score$age <- log(credit_score$age)
p2 <- ggplot(credit_score, aes(age)) +
  geom_histogram()
ggplotly(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Credit limit (log scale) broken down by each categorical predictor.
p3 <- ggplot(credit_score, aes(x = sex, y = limit_bal)) +
  geom_boxplot()
ggplotly(p3)

p4 <- ggplot(credit_score, aes(x = education, y = limit_bal)) +
  geom_boxplot()
ggplotly(p4)

p5 <- ggplot(credit_score, aes(x = marriage, y = limit_bal)) +
  geom_boxplot()
ggplotly(p5)
# Cross-tabulate default status by sex and plot grouped counts.
# Use a descriptive name instead of `t`, which shadows base::t().
sex_counts <- as.data.frame(table(credit_score$default, credit_score$sex))
colnames(sex_counts) <- c('default', 'sex', 'cnt')
p6 <-
  ggplot(sex_counts, aes(x = default, y = cnt, fill = sex)) +
  geom_bar(stat = 'identity', position = position_dodge())
ggplotly(p6)
# Cross-tabulate default status by education level and plot grouped counts.
# Use a descriptive name instead of `t` (shadows base::t()), and a distinct
# plot name so the sex plot (p6) is not silently overwritten.
edu_counts <- as.data.frame(table(credit_score$default, credit_score$education))
colnames(edu_counts) <- c('default', 'education', 'cnt')
p6_education <-
  ggplot(edu_counts, aes(x = default, y = cnt, fill = education)) +
  geom_bar(stat = 'identity', position = position_dodge())
ggplotly(p6_education)
# Cross-tabulate default status by marriage category and plot grouped counts.
# Use a descriptive name instead of `t`, which shadows base::t().
marriage_counts <- as.data.frame(table(credit_score$default, credit_score$marriage))
colnames(marriage_counts) <- c('default', 'marriage', 'cnt')
p7 <-
  ggplot(marriage_counts, aes(x = default, y = cnt, fill = marriage)) +
  geom_bar(stat = 'identity', position = position_dodge())
ggplotly(p7)
Let’s test two hypotheses: (1) Is the mean credit limit (limit_bal) equal for the two groups default = 0 (repaid the credit) and default = 1 (defaulted)? (2) Are the distributions of limit_bal for these two groups equal to each other?
In order to answer these and the following questions I will calculate confidence intervals.
t.test(limit_bal ~ default, data = credit_score)
##
## Welch Two Sample t-test
##
## data: limit_bal by default
## t = 29.46, df = 10186, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.3673776 0.4197521
## sample estimates:
## mean in group 0 mean in group 1
## 11.75006 11.35649
wilcox.test(limit_bal ~ default, data = credit_score)
##
## Wilcoxon rank sum test with continuity correction
##
## data: limit_bal by default
## W = 95786286, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Both tests reject equality decisively (p &lt; 2.2e-16), and the difference in mean log credit limit (about 0.39) is large enough to be practically significant as well.
Now, let’s test another pair of hypotheses: are the mean ages, and the age distributions, equal for these two groups?
t.test(age ~ default, credit_score)
##
## Welch Two Sample t-test
##
## data: age by default
## t = -1.2343, df = 10171, p-value = 0.2171
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.011595106 0.002634737
## sample estimates:
## mean in group 0 mean in group 1
## 3.53598 3.54046
wilcox.test(age ~ default, credit_score)
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by default
## W = 76966880, p-value = 0.3725
## alternative hypothesis: true location shift is not equal to 0
Neither test rejects the null hypothesis (t-test p = 0.22, Wilcoxon p = 0.37), so there is no statistically significant difference in mean (log) age between the two groups — and the confidence interval shows that any difference would be negligible in practical terms anyway.
Now let’s see whether the gender composition of the two groups differs.
# Split the data by default outcome.
good <- filter(credit_score, default == 0)
bad <- filter(credit_score, default == 1)

# Counts of the first sex level (presumably level "1" codes men — TODO confirm
# against the data dictionary) and group totals, written as plain assignments
# instead of zeallot-style destructuring.
good_sex <- table(good$sex)
bad_sex <- table(bad$sex)
ngoodmen <- good_sex[1]
total_good <- sum(good_sex)
nbadmen <- bad_sex[1]
total_bad <- sum(bad_sex)

# 95% score confidence interval for the difference of the two proportions.
diffscoreci(ngoodmen, total_good, nbadmen, total_bad, conf.level = 0.95)
##
##
##
## data:
##
## 95 percent confidence interval:
## -0.06057240 -0.03366348
That means that men fail to return their credits slightly more often (by 3–6 percentage points) than women.
Now, let’s see whether education level affects the default rate. First, we build a table showing the sizes of the default and no-default groups for each education level; second, we look at how these counts differ from the expected ones under independence; finally, we compute the test statistic.
# Contingency table: education level (rows) by default status (columns).
crosstab <- table(credit_score$education, credit_score$default)
crosstab
##
## 0 1
## 0 14 0
## 1 8549 2036
## 2 10700 3330
## 3 3680 1237
## 4 116 7
## 5 262 18
## 6 43 8
crosstab - chisq.test(crosstab)$expected
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
##
## 0 1
## 0 3.0968 -3.0968
## 1 305.4020 -305.4020
## 2 -226.5640 226.5640
## 3 -149.3596 149.3596
## 4 20.2076 -20.2076
## 5 43.9360 -43.9360
## 6 3.2812 -3.2812
chisq.test(crosstab)
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: crosstab
## X-squared = 163.22, df = 6, p-value < 2.2e-16
assocstats(crosstab)
## X^2 df P(> X^2)
## Likelihood Ratio 184.71 6 0
## Pearson 163.22 6 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.074
## Cramer's V : 0.074
Finally, let’s see if the marriage category impacts the default category.
# Contingency table: marriage category (rows) by default status (columns).
marriage_crosstab <- table(credit_score$marriage, credit_score$default)
marriage_crosstab
##
## 0 1
## 0 49 5
## 1 10453 3206
## 2 12623 3341
## 3 239 84
assocstats(marriage_crosstab)
## X^2 df P(> X^2)
## Likelihood Ratio 36.609 3 5.5663e-08
## Pearson 35.662 3 8.8259e-08
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.034
## Cramer's V : 0.034
For both variables (education and marriage) we see a statistically significant association with the default category. However, the contingency coefficients (which tell us how strongly the features are associated) are relatively small.